#!/bin/bash
# export CUDA_VISIBLE_DEVICES=0,3  # uncomment to pin training to specific GPUs
export CUDA_DEVICE_MAX_CONNECTIONS=1
# Disable NCCL peer-to-peer and InfiniBand transports; remove these two lines
# if your interconnect supports them and you want the extra bandwidth.
export NCCL_P2P_DISABLE=1
export NCCL_IB_DISABLE=1
DIR=$(pwd)
# Guide:
# This script supports distributed training across multiple GPU workers, as
# well as single-worker training. Please set the options below according to
# the comments. For multi-worker training, these options must be set manually
# on each worker before running the script there.

# Number of GPUs per worker
GPUS_PER_NODE=2 # or: $(python -c 'import torch; print(torch.cuda.device_count())')

# Number of GPU workers; for single-worker training, set this to 1
NNODES=${NNODES:-1}

# The rank of this worker, in {0, ..., NNODES-1}; for single-worker training, set this to 0
NODE_RANK=${NODE_RANK:-0}

# The IP address of the rank-0 worker; for single-worker training, set this to localhost
MASTER_ADDR=${MASTER_ADDR:-localhost}

# The port for communication
MASTER_PORT=${MASTER_PORT:-6001}
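
# Example (illustrative; 10.0.0.1 is a placeholder for the rank-0 worker's IP):
#   on worker 0: NNODES=2 NODE_RANK=0 MASTER_ADDR=10.0.0.1 bash finetune/finetune_lora_ds.sh
#   on worker 1: NNODES=2 NODE_RANK=1 MASTER_ADDR=10.0.0.1 bash finetune/finetune_lora_ds.sh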

MODEL="/home/komove/.cache/modelscope/hub/Qwen/Qwen-1_8B-Chat" # Set the path if you do not want to load from huggingface directly
# ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations.
# See the section for finetuning in README for more information.
DATA="/home/komove/shimin/moe/data/train_data/copilot1.1_train.json"
EVALDATA="/home/komove/shimin/moe/data/train_data/copilot1.1_dev.json"
DS_CONFIG_PATH="finetune/ds_config_zero2.json"
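
# For reference, a ZeRO stage-2 DeepSpeed config generally looks like the
# sketch below ("auto" lets the HF Trainer fill in values from its own
# arguments). This is an illustration, not necessarily the exact file at
# $DS_CONFIG_PATH:
# {
#   "zero_optimization": {"stage": 2, "overlap_comm": true, "contiguous_gradients": true},
#   "bf16": {"enabled": "auto"},
#   "train_micro_batch_size_per_gpu": "auto",
#   "gradient_accumulation_steps": "auto"
# }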

function usage() {
  echo 'Usage: bash finetune/finetune_lora_ds.sh [-m MODEL_PATH] [-d DATA_PATH] [--deepspeed DS_CONFIG_PATH]'
}

while [[ "$1" != "" ]]; do
  case $1 in
  -m | --model)
    shift
    MODEL=$1
    ;;
  -d | --data)
    shift
    DATA=$1
    ;;
  --deepspeed)
    shift
    DS_CONFIG_PATH=$1
    ;;
  -h | --help)
    usage
    exit 0
    ;;
  *)
    echo "Unknown argument ${1}"
    exit 1
    ;;
  esac
  shift
done
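
# Example (paths are placeholders):
#   bash finetune/finetune_lora_ds.sh -m /path/to/Qwen-1_8B-Chat -d /path/to/train.json --deepspeed finetune/ds_config_zero2.json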

DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"

# Note: $DISTRIBUTED_ARGS is intentionally unquoted so it splits into words.
torchrun $DISTRIBUTED_ARGS finetune.py \
  --model_name_or_path "$MODEL" \
  --data_path "$DATA" \
  --eval_data_path "$EVALDATA" \
  --bf16 True \
  --output_dir output_qwen1.1 \
  --num_train_epochs 4 \
  --per_device_train_batch_size 8 \
  --per_device_eval_batch_size 8 \
  --gradient_accumulation_steps 8 \
  --evaluation_strategy "epoch" \
  --save_strategy "steps" \
  --save_steps 1000 \
  --save_total_limit 10 \
  --learning_rate 3e-4 \
  --weight_decay 0.1 \
  --adam_beta2 0.95 \
  --warmup_ratio 0.01 \
  --lr_scheduler_type "cosine" \
  --logging_steps 2 \
  --report_to "none" \
  --model_max_length 1000 \
  --lazy_preprocess True \
  --use_lora \
  --gradient_checkpointing \
  --deepspeed "${DS_CONFIG_PATH}"
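
# After training, the LoRA adapter (not a merged model) is saved under
# --output_dir. A minimal loading sketch with PEFT (assumes the peft package
# is installed; AutoPeftModelForCausalLM reads the adapter config and pulls
# in the base model automatically):
#   from peft import AutoPeftModelForCausalLM
#   model = AutoPeftModelForCausalLM.from_pretrained(
#       "output_qwen1.1", device_map="auto", trust_remote_code=True
#   ).eval()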
